
## A Research Note on the Prevalence
## of Housing Eviction among Children
## Born in American Cities

## CODE PART A:
## Data cleaning and model fitting calls to stan

## Ian Lundberg and Louis Donnelly

## Department of Sociology, Office of Population Research,
## and Center for Research on Child Wellbeing
## Princeton University

## Code by Ian Lundberg (ilundberg@princeton.edu)
## Last updated 10 May 2018

## Define the location where we will save figures
setwd("C:/Users/iandl/Documents/FF Eviction Prevalence")

## Define the number of imputations
n.imps <- 30
## Define the number of bootstrap samples for confidence intervals
bs.samps <- 10000
## Define the number of warmup draws for Stan
n.warmup <- 1000

save(n.imps, bs.samps, n.warmup,
     file = "Intermediate/specification_numbers.Rdata")

## Load required packages
library(tidyverse)
library(reshape2)
library(foreign)
library(Amelia)
library(doParallel)
library(doRNG)
library(readstata13)
library(rstan)

###################
## Read the data ##
###################

merged <- 
  ## Birth through year 5
  read.dta("Data/ff_res_merge4.dta",
           convert.factors = F) %>%
  ## Year 9
  left_join(
    read.dta("Data/y9merged_112011file.dta",
             convert.factors = F),
    by = "idnum"
  ) %>%
  ## Year 9 weights
  left_join(
    read.dta("Data/year9weights_082013res.dta",
             convert.factors = F),
    by = "idnum"
  ) %>%
  ## Year 15
  left_join(
    read.dta13("Data/ff_y15_res.dta",
                            convert.factors = F),
    by = "idnum"
  ) %>%
  ## Contextual data
  left_join(
    read.dta13(
      "Data/ffgeo_all_res.dta",
      convert.factors = F
    ) %>%
      select(idnum, tm1phisp, tm1pwhte, tm1pblck,
             tm1mrphi, tm1pfbpl),
    by = "idnum"
  )

## We want the labels for sample city
merged$m1city <- read.dta("Data/ff_res_merge4.dta",
                          convert.factors = T)[,"m1city"]

######################
## Create variables ##
######################

## Functions for cleaning rent variables

categorize_housing <- function(x) {
  case_when(
    x == 1 ~ "rent_unshared",
    x == 2 ~ "rent_contribute",
    x == 3 ~ "rent_noContribute",
    x == 4 ~ "own_unshared",
    x == 5 ~ "own_shared",
    x > 0 ~ "other",
    TRUE ~ NA_character_
  )
}

combine_cost <- function(housing, mortgage, rent) {
  case_when(
    (housing %in% c("own_unshared","own_shared") & mortgage > 0) ~ mortgage,
    (housing %in% c("rent_unshared","rent_contribute","rent_noContribute") & rent > 0) ~ rent
  )
}

## Clean variables
d <- transmute(
  merged,
  idnum = idnum,
  m1natwt = m1natwt,
  m1city = recode_factor(m1city,
                         `-9 Not in wave` = NA_character_,
                         `-8 Out of range` = NA_character_,
                         `-7 N/A` = NA_character_,
                         `-6 Skip` = NA_character_,
                         `-5 Not asked` = NA_character_,
                         `-4 Multiple ans` = NA_character_,
                         `-3 Missing` = NA_character_,
                         `-2 Don't know` = NA_character_,
                         `-1 Refuse` = NA_character_),
  ## Primary caregiver at each wave
  m2p = m2a3 %in% c(1,2), # child lives with mother at least half the time at wave 2
  m3p = m3a2 %in% c(1,2),
  m4p = m4a2 %in% c(1,2),
  m5p = m5a2 %in% c(1,2),
  f2p = f2a3 %in% c(1,2) & !m2p, # child lives with father at least half the time at wave 2
  f3p = f3a2 %in% c(1,2) & !m3p,
  f4p = f4a2 %in% c(1,2) & !m4p,
  f5p = f5a2 %in% c(1,2) & !m5p,
  
  ## Participation of each parent at each wave
  m2int = m2a3 != -9,
  m3int = m3a2 != -9,
  m4int = m4a2 != -9,
  m5int = m5a2 != -9,
  f2int = f2a3 != -9,
  f3int = f3a2 != -9,
  f4int = f4a2 != -9,
  f5int = f5a2 != -9,
  
  ## Each parents' report of eviction in each wave
  m2ev = ifelse(m2h19e <= 0, NA, m2h19e == 1),
  m3ev = ifelse(m3i23c <= 0, NA, m3i23c == 1),
  m4ev = ifelse(m4i23e <= 0, NA, m4i23e == 1),
  m5ev = ifelse(m5f23d <= 0, NA, m5f23d == 1),
  f2ev = ifelse(f2h17e <= 0, NA, f2h17e == 1),
  f3ev = ifelse(f3i23c <= 0, NA, f3i23c == 1),
  f4ev = ifelse(f4i23e <= 0, NA, f4i23e == 1),
  f5ev = ifelse(f5f23d <= 0, NA, f5f23d == 1),
  
  ## Keep deceased indicators for use later
  ## (note: year 15 indicator does not exist yet)
  cm2samp = cm2samp,
  cm3samp = cm3samp,
  cm4samp = cm4samp,
  cm5samp = cm5samp,
  cp6samp = cp6samp,
  
  ## Whether child experience eviction each wave, using one report only,
  ## replacing with 0 if child has ever been reported deceased before this wave
  ev2 = as.numeric(case_when(
    m2p == 1 ~ m2ev,
    f2p == 1 ~ f2ev
  )),
  ev3 = as.numeric(case_when(
    cm2samp == 2 ~ F,
    m3p == 1 ~ m3ev,
    f3p == 1 ~ f3ev
  )),
  ev4 = as.numeric(case_when(
    cm2samp == 2 | cm3samp == 2 ~ F,
    m4p == 1 ~ m4ev,
    f4p == 1 ~ f4ev
  )),
  ev5 = as.numeric(case_when(
    cm2samp == 2 | cm3samp == 2 | cm4samp == 2 ~ F,
    m5p == 1 ~ m5ev,
    f5p == 1 ~ f5ev
  )),
  ev6 = as.numeric(case_when(
    cm2samp == 2 | cm3samp == 2 | cm4samp == 2 | cm5samp == 2 ~ F,
    p6j40 >= 0 ~ p6j40 == 1
  )),
  ev6long = as.numeric(case_when(
    cm2samp == 2 | cm3samp == 2 | cm4samp == 2 | cm5samp == 2 ~ F,
    p6j51 >= 0 | ev6 == 1 ~ p6j51 == 1 | ev6 == 1
  )),
  any.ev.retro = as.numeric((ev2 + ev3 + ev4 + ev5 + ev6long) >= 1),
  multiple.ev.retro = as.numeric((ev2 + ev3 + ev4 + ev5 + ev6long) >= 2),
  
  ## Whether this is a valid report (useful for MI stage)
  valid2 = !is.na(ev2),
  valid3 = !is.na(ev3),
  valid4 = !is.na(ev4),
  valid5 = !is.na(ev5),
  valid6 = !is.na(ev6), 
  valid6long = !is.na(ev6long),
  
  ## PREDICTORS
  cm1ethrace = factor(case_when(cm1ethrace == 4 ~ as.integer(1),
                                cm1ethrace > 0 ~ cm1ethrace),
                      labels = c("Other","Black","Hispanic")),
  cm1edu = factor(case_when(cm1edu > 0 ~ cm1edu),
                  labels = c("Less than HS","HS","Some college","College")),
  married = ifelse(cm1relf <= 0, NA, cm1relf == 1),
  cm1age = ifelse(cm1age <= 0, NA, cm1age),
  m1imm = as.numeric(ifelse(m1h2 <= 0, NA, m1h2 == 2)), # foreign-born indicator
  
  ## Household income in every wave
  ## Pull from appropriate parent's report and top-code at 5
  ## At birth treat mother as appropriate report
  inc1 = case_when(cm1inpov >= 5 ~ 5,
                   cm1inpov >= 0 ~ cm1inpov),
  inc2 = case_when(m2p == 1 & cm2povco >= 5 ~ 5,
                   m2p == 1 & cm2povco >= 0 ~ cm2povco,
                   f2p == 1 & cf2povco >= 5 ~ 5,
                   f2p == 1 & cf2povco >= 0 ~ cf2povco),
  inc3 = case_when(m3p == 1 & cm3povco >= 5 ~ 5,
                   m3p == 1 & cm3povco >= 0 ~ cm3povco,
                   f3p == 1 & cf3povco >= 5 ~ 5,
                   f3p == 1 & cf3povco >= 0 ~ cf3povco),
  inc4 = case_when(m4p == 1 & cm4povco >= 5 ~ 5,
                   m4p == 1 & cm4povco >= 0 ~ cm4povco,
                   f4p == 1 & cf4povco >= 5 ~ 5,
                   f4p == 1 & cf4povco >= 0 ~ cf4povco),
  inc5 = case_when(m5p == 1 & cm5povco >= 5 ~ 5,
                   m5p == 1 & cm5povco >= 0 ~ cm5povco,
                   f5p == 1 & cf5povco >= 5 ~ 5,
                   f5p == 1 & cf5povco >= 0 ~ cf5povco),
  inc6 = case_when(cp6povco >= 5 ~ 5,
                   cp6povco >= 0 ~ cp6povco),
  
  
  ## Rent and mortgage payments
  m2housing = categorize_housing(m2h2),
  m3housing = categorize_housing(m3i2),
  m4housing = case_when(m4i1 != 1 ~ m3housing, # if haven't moved, bring forward
                        T ~ categorize_housing(m4i2)),
  m5housing = case_when(m5f1 != 1 ~ m4housing, # if haven't moved, bring forward
                        T ~ categorize_housing(m5f2)),
  f2housing = categorize_housing(f2h2),
  f3housing = categorize_housing(f3i2),
  f4housing = case_when(f4i1 != 1 ~ f3housing, # if haven't moved, bring forward
                        T ~ categorize_housing(f4i2)),
  f5housing = case_when(f5f1 != 1 ~ f4housing, # if haven't moved, bring forward
                        T ~ categorize_housing(f5f2)),
  p6housing = categorize_housing(p6j6),
  m2mortgage = m2h3b,
  m3mortgage = m3i3b,
  m4mortgage = case_when(m4i1 != 1 ~ m3mortgage, # if haven't moved, bring forward
                         T ~ m4i3b),
  m5mortgage = case_when(m5f1 != 1 ~ m4mortgage, # if haven't moved, bring forward
                         T ~ m5f3b),
  f2mortgage = f2h3b,
  f3mortgage = f3i3b,
  f4mortgage = case_when(f4i1 != 1 ~ f3mortgage, # if haven't moved, bring forward
                         T ~ f4i3b),
  f5mortgage = case_when(f5f1 != 1 ~ f4mortgage, # if haven't moved, bring forward
                         T ~ f5f3b),
  p6mortgage = p6j9,
  m2rent = m2h4,
  m3rent = m3i4,
  m4rent = case_when(m4i1 != 1 ~ m3rent, # if haven't moved, bring forward
                     T ~ m4i4),
  m5rent = case_when(m5f1 != 1 ~ m4rent, # if haven't moved, bring forward
                     T ~ m5f4),
  f2rent = f2h4,
  f3rent = f3i4,
  f4rent = case_when(f4i1 != 1 ~ f3rent, # if haven't moved, bring forward
                     T ~ f4i4),
  f5rent = case_when(f5f1 != 1 ~ f4rent, # if haven't moved, bring forward
                     T ~ f5f4),
  p6rent = p6j11,
  m2cost = combine_cost(m2housing, m2mortgage, m2rent),
  m3cost = combine_cost(m3housing, m3mortgage, m3rent),
  m4cost = combine_cost(m4housing, m4mortgage, m4rent),
  m5cost = combine_cost(m5housing, m5mortgage, m5rent),
  f2cost = combine_cost(f2housing, f2mortgage, f2rent),
  f3cost = combine_cost(f3housing, f3mortgage, f3rent),
  f4cost = combine_cost(f4housing, f4mortgage, f4rent),
  f5cost = combine_cost(f5housing, f5mortgage, f5rent),
  p6cost = combine_cost(p6housing, p6mortgage, p6rent),
  rent_burden2 = case_when(
    m2p & cm2hhinc > 0 ~ 12 * m2cost / cm2hhinc,
    f2p & cf2hhinc > 0 ~ 12 * f2cost / cf2hhinc
  ),
  rent_burden3 = case_when(
    m3p & cm3hhinc > 0 ~ 12 * m3cost / cm3hhinc,
    f3p & cf3hhinc > 0 ~ 12 * f3cost / cf3hhinc
  ),
  rent_burden4 = case_when(
    m4p & cm4hhinc > 0 ~ 12 * m4cost / cm4hhinc,
    f4p & cf4hhinc > 0 ~ 12 * f4cost / cf4hhinc
  ),
  rent_burden5 = case_when(
    m5p & cm5hhinc > 0 ~ 12 * m5cost / cm5hhinc,
    f5p & cf5hhinc > 0 ~ 12 * f5cost / cf5hhinc
  ),
  rent_burden6 = case_when(
    cp6hhinc > 0 ~ 12 * p6cost / cp6hhinc
  ),
  ## Top-code all rent burdens
  rent_burden2 = case_when(rent_burden2 > 1 ~ 1,
                           TRUE ~ rent_burden2),
  rent_burden3 = case_when(rent_burden3 > 1 ~ 1,
                           TRUE ~ rent_burden3),
  rent_burden4 = case_when(rent_burden4 > 1 ~ 1,
                           TRUE ~ rent_burden4),
  rent_burden5 = case_when(rent_burden5 > 1 ~ 1,
                           TRUE ~ rent_burden5),
  rent_burden6 = case_when(rent_burden6 > 1 ~ 1,
                           TRUE ~ rent_burden6),
  ## Create indicators for housing types
  housing2 = case_when(m2p ~ case_when(grepl("rent",m2housing) ~ "Rental",
                                       grepl("own",m2housing) ~ "Owned",
                                       !is.na(m2housing) ~ "Other"),
                       f2p ~ case_when(grepl("rent",f2housing) ~ "Rental",
                                       grepl("own",f2housing) ~ "Owned",
                                       !is.na(f2housing) ~ "Other")),
  housing3 = case_when(m3p ~ case_when(grepl("rent",m3housing) ~ "Rental",
                                       grepl("own",m3housing) ~ "Owned",
                                       !is.na(m3housing) ~ "Other"),
                       f3p ~ case_when(grepl("rent",f3housing) ~ "Rental",
                                       grepl("own",f3housing) ~ "Owned",
                                       !is.na(f3housing) ~ "Other")),
  housing4 = case_when(m4p ~ case_when(grepl("rent",m4housing) ~ "Rental",
                                       grepl("own",m4housing) ~ "Owned",
                                       !is.na(m4housing) ~ "Other"),
                       f4p ~ case_when(grepl("rent",f4housing) ~ "Rental",
                                       grepl("own",f4housing) ~ "Owned",
                                       !is.na(f4housing) ~ "Other")),
  housing5 = case_when(m5p ~ case_when(grepl("rent",m5housing) ~ "Rental",
                                       grepl("own",m5housing) ~ "Owned",
                                       !is.na(m5housing) ~ "Other"),
                       f5p ~ case_when(grepl("rent",f5housing) ~ "Rental",
                                       grepl("own",f5housing) ~ "Owned",
                                       !is.na(f5housing) ~ "Other")),
  housing6 = case_when(grepl("rent",p6housing) ~ "Rental",
                       grepl("own",p6housing) ~ "Owned",
                       !is.na(p6housing) ~ "Other"),
  ## Make binary indicators since these can be continuous in the model
  ## and Amelia works better with cts than nominal variables
  own2 = as.numeric(housing2 == "Owned"),
  own3 = as.numeric(housing3 == "Owned"),
  own4 = as.numeric(housing4 == "Owned"),
  own5 = as.numeric(housing5 == "Owned"),
  own6 = as.numeric(housing6 == "Owned"),
  
  ## Child age in years
  age2 = case_when(m2p & cm2b_age > 0 ~ cm2b_age / 12,
                   f2p & cf2b_age > 0 ~ cf2b_age / 12),
  age3 = case_when(m3p & cm3b_age > 0 ~ cm3b_age / 12,
                   f3p & cf3b_age > 0 ~ cf3b_age / 12),
  age4 = case_when(m4p & cm4b_age > 0 ~ cm4b_age / 12,
                   f4p & cf4b_age > 0 ~ cf4b_age / 12),
  age5 = case_when(m5p & cm5b_age > 0 ~ cm5b_age / 12,
                   f5p & cf5b_age > 0 ~ cf5b_age / 12),
  age6 = case_when(cp6yagem > 0 ~ cp6yagem / 12),
  
  ## Identify date of survey report,
  ## subtracting 1/2 a year to get middle of reporting period
  date2 = case_when(
    m2p & m2intyr > 0 & m2intmon > 0 ~ m2intyr + (m2intmon - 1)/12,
    f2p & f2intyr > 0 & f2intmon > 0 ~ f2intyr + (f2intmon - 1)/12
  ) - .5,
  date3 = case_when(
    m3p & m3intyr > 0 & m3intmon > 0 ~ m3intyr + (m3intmon - 1)/12,
    f3p & f3intyr > 0 & f3intmon > 0 ~ f3intyr + (f3intmon - 1)/12
  ) - .5,
  date4 = case_when(
    m4p & m4intyr > 0 & m4intmon > 0 ~ m4intyr + (m4intmon - 1)/12,
    f4p & f4intyr > 0 & f4intmon > 0 ~ f4intyr + (f4intmon - 1)/12
  ) - .5,
  date5 = case_when(
    m5p & cm5intyr > 0 & cm5intmon > 0 ~ cm5intyr + (cm5intmon - 1)/12,
    f5p & cf5intyr > 0 & cf5intmon > 0 ~ cf5intyr + (cf5intmon - 1)/12
  ) - .5,
  date6 = case_when(
    cp6intyr > 0 & cp6intmon > 0 ~ cp6intyr + (cp6intmon - 1)/12
  ) - .5,
  
  ## Weights
  m1natwt = m1natwt,
  m2natwt = m2natwt,
  m3natwt = m3natwt,
  m4natwt = m4natwt,
  m5natwt = m5natwt,
  m1citywt = m1citywt,
  m2citywt = m2citywt,
  m3citywt = m3citywt,
  m4citywt = m4citywt,
  m5citywt = m5citywt,
  
  ## Mother's WAIS-R cognitive score
  cm3cogsc = ifelse(cm3cogsc >= 0, cm3cogsc, NA),
  
  ## Mother's impulsivity (Dickman)
  impulsivity = 25 - 
    ifelse(m3j44a >= 0, m3j44a, NA) +
    ifelse(m3j44b >= 0, m3j44b, NA) +
    ifelse(m3j44c >= 0, m3j44c, NA) +
    ifelse(m3j44d >= 0, m3j44d, NA) +
    ifelse(m3j44e >= 0, m3j44e, NA) +
    ifelse(m3j44f >= 0, m3j44f, NA),
  impulsivity = (impulsivity - mean(impulsivity, na.rm = T)) / sd(impulsivity, na.rm = T),
  
  ## Father ever in jail
  cmf1finjail = case_when(cmf1finjail >= 0 ~ cmf1finjail),
  cmf2fevjail = case_when(
    cmf2fevjail >= 0 ~ cmf2fevjail,
    cmf1finjail == 1 ~ as.integer(1)
  ),
  cmf3fevjail = case_when(
    cmf3fevjail >= 0 ~ cmf3fevjail,
    cmf2fevjail == 1 ~ as.integer(1)
  ),
  cmf4fevjail = case_when(
    cmf4fevjail >= 0 ~ cmf4fevjail,
    cmf3fevjail == 1 ~ as.integer(1)
  ),
  cmf5fevjail = case_when(
    cmf5fevjail >= 0 ~ cmf5fevjail,
    cmf4fevjail == 1 ~ 1
  ),
  
  ## Neighborhood characteristics
  prop.hisp = tm1phisp,
  prop.black = tm1pblck,
  prop.white = tm1pwhte,
  prop.all.other = 1 - (prop.hisp + prop.black + prop.white),
  prop.below.povline = tm1pfbpl,
  med.rent.over.income = tm1mrphi / 100
)

################
## IMPUTATION ##
################

toimpute <- c("ev2", "ev3", "ev4", "ev5", "ev6",
              "cm1ethrace", "cm1edu", "married", "cm1age", "m1imm", "m1city",
              "inc2", "inc3", "inc4", "inc5", "inc6",
              "rent_burden2", "rent_burden3", "rent_burden4",
              "rent_burden5", "rent_burden6",
              "own2", "own3", "own4", "own5", "own6",
              "age2","age3","age4","age5","age6",
              "date2","date3","date4","date5","date6",
              "impulsivity","cm3cogsc",
              "cmf2fevjail",
              "prop.hisp","prop.black","prop.all.other",
              "prop.below.povline","med.rent.over.income")

## Multiply impute
## For the main models
amelia.untransformed <- amelia(d[,c("idnum","m1natwt","m1citywt",toimpute)],
                               m = n.imps,
                               seed = 08544,
                               idvars = c("idnum","m1natwt","m1citywt"),
                               noms = c("cm1ethrace","m1city"),
                               ords = "cm1edu")

## We re-estimate multiple imputation with several different
## sets of variables to optimize for a simple imputation procedure
## for a few quantities of interest, such as retrospective
## eviction reports, reports of any eviction,
## and reports of multiple evictions.

## For descriptives including the retrospective report
## We do this separately because the age 15 1-year and retrospective
## reports are too highly correlated to go in a single model,
## and are missing for the same respondents anyway.
imputed.retro <- amelia(d[,c("idnum","m1natwt","m1citywt",
                             toimpute[-which(toimpute == "ev6")],
                             "ev6long")],
                        m = n.imps,
                        seed = 08544,
                        idvars = c("idnum","m1natwt","m1citywt"),
                        noms = c("cm1ethrace","m1city"),
                        ords = "cm1edu")

## Impute cumulative estimates of any eviction
## and of multiple evictions
set.seed(08544)
imputed.cumulative.estimates <- lapply(
  list(any.ev.retro = "any.ev.retro",
       multiple.ev.retro = "multiple.ev.retro"),
  function(outcome) {
    amelia(d[,c("idnum","m1natwt","m1citywt",
                toimpute[!grepl("^ev",toimpute)],
                outcome)],
           m = n.imps,
           seed = 08544,
           idvars = c("idnum","m1natwt","m1citywt"),
           noms = c("cm1ethrace","m1city"),
           ords = "cm1edu")
  }
)

## Post-process the main model imputations
amelia.out <- transform(
  amelia.untransformed,
  
  ## Produce indicators that are averaged over the 5 observations
  ## for rent burden,
  ## permanent income, which we make categorical to model flexibly,
  ## and proportion of years in which you live in an owned home
  rent_burden = 1 / 5 * (rent_burden2 + rent_burden3 + rent_burden4 + 
                           rent_burden5 + rent_burden6),
  income_mean = 1 / 5 * (inc2 + inc3 + inc4 + inc5 + inc6),
  income_mean_cat = factor((1 / 5 * (inc2 + inc3 + inc4 + inc5 + inc6) > .5) + 
                             (1 / 5 * (inc2 + inc3 + inc4 + inc5 + inc6) > 1) +
                             (1 / 5 * (inc2 + inc3 + inc4 + inc5 + inc6) > 2) + 
                             (1 / 5 * (inc2 + inc3 + inc4 + inc5 + inc6) > 3),
                           labels = c("Below 50 percent of poverty line",
                                      "50-100 percent of poverty line",
                                      "100-200 percent of poverty line",
                                      "200-300 percent of poverty line",
                                      "Above 300 percent of poverty line")),
  prop_own = 1 / 5 * (own2 + own3 + own4 + own5 + own6),
  
  ## Proportion of neighborhood that is white is perfectly collinear
  ## with proportions of other categories so was not directly imputed;
  ## impute here for use in descriptive statistics.
  prop.white = 1 - (prop.black + prop.hisp + prop.all.other),
  
  ## Mark imputed version of eviction
  ev.imp2 = ev2,
  ev.imp3 = ev3,
  ev.imp4 = ev4,
  ev.imp5 = ev5,
  ev.imp6 = ev6,
  
  ## Replace eviction (the outcome) with the non-imputed version
  ev2 = as.numeric(d$ev2),
  ev3 = as.numeric(d$ev3),
  ev4 = as.numeric(d$ev4),
  ev5 = as.numeric(d$ev5),
  ev6 = as.numeric(d$ev6)
)

## Store long form of imputations
imps.long <- lapply(amelia.out$imputations, function(imp) {
  return(
    reshape(imp %>%
              select(-inc2,-inc3,-inc4,-inc5,-inc6,
                     -own2,-own3,-own4,-own5,-own6,
                     -rent_burden2,-rent_burden3,-rent_burden4,
                     -rent_burden5,-rent_burden6),
            idvar = "idnum",
            varying = list(paste0("ev",2:6),
                           paste0("ev.imp",2:6),
                           paste0("age",2:6),
                           paste0("date",2:6)),
            v.names = c("ev","ev.imp","age","date"),
            timevar = "wave",
            direction = "long") %>%
      mutate(recession = date >= 2008 & date < 2010)
  )
})

## Bootstrap samples to summarize
## prevalence by wave
cl <- makeCluster(15)
registerDoParallel(cl)
set.seed(08544)
prevalenceByWave_samples <- foreach(
  i = 1:bs.samps, 
  .packages = "tidyverse",
  .combine = "rbind"
) %dorng% {
  imp <- i %% n.imps + 1
  amelia.out$imputations[[imp]] %>%
    filter(!is.na(m1natwt)) %>%
    sample_frac(size = 1,
                replace = T,
                weight = m1natwt) %>%
    select(ev.imp2,ev.imp3,ev.imp4,ev.imp5,ev.imp6) %>%
    summarise_all(.funs = mean)
}
stopCluster(cl)

################################
## Lower bound and            ##
## multiply imputed estimates ##
################################

## Lower bound
lowerBound <- d %>%
  select(ev2, ev3, ev4, ev5, ev6, ev6long, m1natwt) %>%
  filter(!is.na(m1natwt)) %>%
  mutate(ev2 = ifelse(is.na(ev2), 0, ev2),
         ev3 = ifelse(is.na(ev3), 0, ev3),
         ev4 = ifelse(is.na(ev4), 0, ev4),
         ev5 = ifelse(is.na(ev5), 0, ev5),
         ev6 = ifelse(is.na(ev6), 0, ev6),
         ev6long = ifelse(is.na(ev6long), 0, ev6long),
         any = ev2 | ev3 | ev4 | ev5 | ev6 | ev6long) %>%
  summarize(prop.evicted = weighted.mean(any, w = m1natwt))

## Bootstrap that estimate to get a CI
cl <- makeCluster(15)
registerDoParallel(cl)
set.seed(08544)
lowerBoundSamples <- foreach(
  i = 1:bs.samps, .packages = "tidyverse",
  .combine = "rbind"
) %dorng% {
  d %>%
    select(ev2, ev3, ev4, ev5, ev6, ev6long, m1natwt) %>%
    filter(!is.na(m1natwt)) %>%
    ## Sampling with weighting means we don't have to
    ## weight the estmate: the resampling accounts
    ## for the weights
    sample_frac(replace = T, weight = m1natwt) %>%
    mutate(ev2 = ifelse(is.na(ev2), 0, ev2),
           ev3 = ifelse(is.na(ev3), 0, ev3),
           ev4 = ifelse(is.na(ev4), 0, ev4),
           ev5 = ifelse(is.na(ev5), 0, ev5),
           ev6 = ifelse(is.na(ev6), 0, ev6),
           ev6long = ifelse(is.na(ev6long), 0, ev6long),
           any = ev2 | ev3 | ev4 | ev5 | ev6 | ev6long) %>%
    summarize(prop.evicted = mean(any))
}
stopCluster(cl)
lowerBoundCI <- lowerBoundSamples %>%
  summarize(CI.min = quantile(prop.evicted, .025),
            CI.max = quantile(prop.evicted, .975))

## Multiple imputation
mi <- mean(sapply(imputed.cumulative.estimates$any.ev.retro$imputations,
                  function(imp) weighted.mean(imp$any.ev.retro[!is.na(d$m1natwt)],
                                              w = imp$m1natwt[!is.na(d$m1natwt)])))
cl <- makeCluster(15)
registerDoParallel(cl)
set.seed(08544)
miSamples <- foreach(
  i = 1:bs.samps, .packages = "tidyverse",
  .combine = "rbind"
) %dorng% {
  imp <- i %% n.imps + 1
  imputed.cumulative.estimates$any.ev.retro$imputations[[imp]] %>%
    select(m1natwt, any.ev.retro) %>%
    filter(!is.na(m1natwt)) %>%
    ## Sampling with weighting means we don't have to
    ## weight the estmate: the resampling accounts
    ## for the weights
    sample_frac(replace = T, weight = m1natwt) %>%
    summarize(prop.evicted = mean(any.ev.retro))
}
stopCluster(cl)
miCI <- miSamples %>%
  summarize(CI.min = quantile(prop.evicted, .025),
            CI.max = quantile(prop.evicted, .975))

lowerBound_and_MI <- bind_rows(
  data.frame(estimand = "LowerBound",
             estimate = lowerBound$prop.evicted,
             lowerBoundCI),
  data.frame(estimand = "MI",
             estimate = mi,
             miCI)
)

############################
## PREPARE DATA FOR MODEL ##
##  TO FILL IN ALL YEARS  ##
############################

## Scale the data in preparation for the Stan fit
## Expanded will stack the imputations on top of each other
## with imp identifying which rows belong to which imputation
expanded <- do.call(rbind, imps.long) %>%
  mutate(imp = rep(1:n.imps, each = nrow(imps.long[[1]])),
         ## Make city IDs numeric 1-20
         #m1city = as.numeric(m1city),
         ## Make factor variables a series of dummies
         Black = cm1ethrace == "Black",
         Hispanic = cm1ethrace == "Hispanic",
         LessThanHS = cm1edu == "Less than HS",
         SomeCollege = cm1edu == "Some college",
         College = cm1edu == "College",
         IncBelow50PctPovLine = income_mean_cat == "Below 50 percent of poverty line",
         Inc50.100PctPovLine = income_mean_cat == "50-100 percent of poverty line",
         Inc100.200PctPovLine = income_mean_cat == "100-200 percent of poverty line",
         Inc200.300PctPovLine = income_mean_cat == "200-300 percent of poverty line") %>%
  select(imp, idnum, m1city, ev,
         Black, Hispanic,
         LessThanHS, SomeCollege, College,
         married, cm1age, m1imm,
         IncBelow50PctPovLine, Inc50.100PctPovLine,
         Inc100.200PctPovLine, Inc200.300PctPovLine,
         prop_own,rent_burden,
         impulsivity, cm3cogsc, cmf2fevjail,
         prop.hisp, prop.black, prop.all.other,
         prop.below.povline, med.rent.over.income,
         age, recession)

## Determine the mean value of all predictors
## and the factor by which we will scale
means <- sapply(expanded[,-(1:4)], mean)
scale.factors <- apply(expanded[,-(1:4)], 2,
                       function(x) {
                         if (length(unique(x)) == 2) {
                           scale.factor = 1 
                         } else {
                           scale.factor = 2*sd(x)
                         }
                         return(scale.factor)
                       })
## cmf5fevjail should have scale factor 1 since binary, even though
## some imputed values are not 0 or 1
scale.factors["cmf2fevjail"] <- 1

## Re-scale all predictors
scaled.long <- transmute(
  expanded,
  ev = ev,
  imp = imp,
  idnum = idnum,
  m1city = m1city,
  Black = (Black - means["Black"]) / scale.factors["Black"],
  Hispanic = (Hispanic - means["Hispanic"]) / scale.factors["Hispanic"],
  LessThanHS = (LessThanHS - means["LessThanHS"]) / scale.factors["LessThanHS"],
  SomeCollege = (SomeCollege - means["SomeCollege"]) / scale.factors["SomeCollege"],
  College = (College - means["College"]) / scale.factors["College"],
  married = (married - means["married"]) / scale.factors["married"],
  cm1age = (cm1age - means["cm1age"]) / scale.factors["cm1age"],
  m1imm = (m1imm - means["m1imm"]) / scale.factors["m1imm"],
  IncBelow50PctPovLine = (IncBelow50PctPovLine - means["IncBelow50PctPovLine"]) / scale.factors["IncBelow50PctPovLine"],
  Inc50.100PctPovLine = (Inc50.100PctPovLine - means["Inc50.100PctPovLine"]) / scale.factors["Inc50.100PctPovLine"],
  Inc100.200PctPovLine = (Inc100.200PctPovLine - means["Inc100.200PctPovLine"]) / scale.factors["Inc100.200PctPovLine"],
  Inc200.300PctPovLine = (Inc200.300PctPovLine - means["Inc200.300PctPovLine"]) / scale.factors["Inc200.300PctPovLine"],
  prop_own = (prop_own - means["prop_own"]) / scale.factors["prop_own"],
  rent_burden = (rent_burden - means["rent_burden"]) / scale.factors["rent_burden"],
  impulsivity = (impulsivity - means["impulsivity"]) / scale.factors["impulsivity"],
  cm3cogsc = (cm3cogsc - means["cm3cogsc"]) / scale.factors["cm3cogsc"],
  cmf2fevjail = (cmf2fevjail - means["cmf2fevjail"]) / scale.factors["cmf2fevjail"],
  prop.hisp = (prop.hisp - means["prop.hisp"]) / scale.factors["prop.hisp"],
  prop.black = (prop.black - means["prop.black"]) / scale.factors["prop.black"],
  prop.all.other = (prop.all.other - means["prop.all.other"]) / scale.factors["prop.all.other"],
  prop.below.povline = (prop.below.povline - means["prop.below.povline"]) / scale.factors["prop.below.povline"],
  med.rent.over.income = (med.rent.over.income - means["med.rent.over.income"]) / scale.factors["med.rent.over.income"],
  age = (age - means["age"]) / scale.factors["age"],
  recession = (recession - means["recession"]) / scale.factors["recession"]
)

## Define the model formulas of increasing complexity
modelForms <- list(
  # Null
  formula(~ -1),
  # Basic demographics
  formula(~ -1 + Black + Hispanic + 
            LessThanHS + SomeCollege + College + 
            cm1age + m1imm + 
            married),
  # + time variables
  formula(~ -1 + Black + Hispanic + 
            LessThanHS + SomeCollege + College + 
            cm1age + m1imm + 
            married +
            age + recession),
  # + income, home ownership, and rent burden
  formula(~ -1 + Black + Hispanic + 
            LessThanHS + SomeCollege + College + 
            cm1age + m1imm + 
            married +
            age + recession + 
            IncBelow50PctPovLine + Inc50.100PctPovLine + Inc100.200PctPovLine + Inc200.300PctPovLine +
            married:(IncBelow50PctPovLine + Inc50.100PctPovLine + Inc100.200PctPovLine + Inc200.300PctPovLine) +
            prop_own + rent_burden),
  # + parental characteristics
  formula(~ -1 + Black + Hispanic + 
            LessThanHS + SomeCollege + College + 
            cm1age + m1imm + 
            married +
            age + recession + 
            IncBelow50PctPovLine + Inc50.100PctPovLine + Inc100.200PctPovLine + Inc200.300PctPovLine +
            married:(IncBelow50PctPovLine + Inc50.100PctPovLine + Inc100.200PctPovLine + Inc200.300PctPovLine) +
            prop_own + rent_burden +
            impulsivity + cm3cogsc + cmf2fevjail),
  # + neighborhood characteristics
  formula(~ -1 + Black + Hispanic + 
            LessThanHS + SomeCollege + College + 
            cm1age + m1imm + 
            married +
            age + recession + 
            IncBelow50PctPovLine + Inc50.100PctPovLine + Inc100.200PctPovLine + Inc200.300PctPovLine +
            married:(IncBelow50PctPovLine + Inc50.100PctPovLine + Inc100.200PctPovLine + Inc200.300PctPovLine) +
            prop_own + rent_burden +
            impulsivity + cm3cogsc + cmf2fevjail +
            prop.hisp + prop.black + prop.all.other + 
            prop.below.povline + med.rent.over.income)
)


## Create a list of predictor matrices for the model
## where each element is 1 MI version of X
X_list <- lapply(modelForms, function(modelForm) {
  lapply(1:n.imps, function(i) {
    model.matrix(modelForm,
                 data = filter(scaled.long, imp == i & !is.na(ev)))
  })
})
  
## Define the data we will use to produce cumulative prevalence
## predictions given model parameters.
X.beforeExpansion <- scaled.long %>%
  group_by(imp, idnum) %>%
  mutate(count = 1:n()) %>%
  filter(count == 1) %>%
  select(-count) %>%
  group_by()
X.expanded <- X.beforeExpansion[rep(1:nrow(X.beforeExpansion), each = 15),] %>%
  group_by(idnum, imp) %>%
  mutate(age = 1:n()) %>%
  mutate(recession = ifelse(age >= 8 & age <= 9,1,0) - means["recession"],
         age = (age - means["age"]) / scale.factors["age"]) %>%
  select(-ev) %>%
  group_by()

## Make a clear linkage of the identifiers with the cities and respondents
IDs <- X.expanded %>%
  group_by(idnum) %>%
  mutate(index = 1:n()) %>%
  filter(index == 1) %>%
  group_by() %>%
  mutate(city = as.numeric(m1city),
         person = 1:n()) %>%
  select(idnum, person, m1city, city)

stan_data <- lapply(X_list, function(X) {
  lapply(
    1:n.imps, function(i) {
      list(N_city = 20,
           N_person = 4898,
           N = nrow(X[[1]]),
           city = (scaled.long %>%
                     filter(imp == i & !is.na(ev)) %>%
                     left_join(IDs, by = "idnum"))$city,
           person = (scaled.long %>%
                       filter(imp == i & !is.na(ev)) %>%
                       left_join(IDs, by = "idnum"))$person,
           Y = filter(scaled.long, imp == i & !is.na(ev))$ev,
           K = ncol(X[[i]]),
           X = X[[i]])
    }
  )
})

## Create a list of predictor matrices
## and individual and city identifers
## for the 15-year predictions
## where each element is 1 MI version of X

## That is nested within a list of the model formulas we consider

newData <- lapply(modelForms, function(modelForm) {
  lapply(1:n.imps, function(i) {
    list(
      city = (X.expanded %>%
                filter(imp == i) %>%
                left_join(IDs, by = "idnum"))$city,
      person = (X.expanded %>%
                  filter(imp == i) %>%
                  left_join(IDs, by = "idnum"))$person,
      X = model.matrix(modelForm,
                       data = filter(X.expanded, imp == i))
    )
  })
})

## Save data objects for use later
save(X.expanded, file = "Intermediate/X.expanded.Rdata")
save(IDs, file = "Intermediate/IDs.Rdata")
save(scaled.long, file = "Intermediate/scaled.long.Rdata")
save(means, file = "Intermediate/means.Rdata")
save(expanded, file = "Intermediate/expanded.Rdata")
save(scale.factors, file = "Intermediate/scale.factors.Rdata")
save(merged, file = "Intermediate/merged.Rdata")
save(d, file = "Intermediate/d.Rdata")
save(prevalenceByWave_samples,
     file = "Intermediate/prevalenceByWave_samples.Rdata")
save(amelia.untransformed, file = "Intermediate/amelia.untransformed.Rdata")
save(amelia.out, file = "Intermediate/amelia.out.Rdata")
save(imputed.retro, file = "Intermediate/imputed.retro.Rdata")
save(imputed.cumulative.estimates, file = "Intermediate/imputed.cumulative.estimates.Rdata")
save(lowerBound_and_MI,
     file = "Intermediate/lowerBound_and_MI.Rdata")
save(stan_data, file = "Intermediate/stan_data.Rdata")
save(newData, file = "Intermediate/newData.Rdata")
save(modelForms, file = "Intermediate/modelForms.Rdata")

###################
## FIT THE MODEL ##
###################

## Prepare for parallel computing
cl <- makeCluster(15, outfile = "Output/outfile.txt")
# In PowerShell, can tail that outfile with Get-Content Output/outfile.txt -Wait
registerDoParallel(cl)

source("Code/B_Eviction_Demography_Stan.R")

set.seed(08544)
stanPred <- foreach(
  re = c(F,T), #length(modelForms),
  .packages = c("tidyverse","rstan"),
  .combine = "rbind"
) %do% {
  pred_all_models <- foreach(
    i = 1:length(modelForms),
    .packages = c("tidyverse","rstan"),
    .combine = "rbind"
  ) %do% {
    pred_this_model <- foreach(
      j = 1:n.imps,
      .packages = c("tidyverse","rstan")
    ) %dorng% {
      print(paste("Fitting Stan on imputation",j,"of",n.imps,"for model",i,"of",length(modelForms),"with random intercepts",re))
      t0 <- Sys.time()
      if (re == T & stan_data[[i]][[j]]$K > 0) {
        model_code <- stan_code
      } else if (re == T & stan_data[[i]][[j]]$K == 0) {
        model_code <- stan_code_noCovs
      } else if (re == F & stan_data[[i]][[j]]$K > 0) {
        model_code <- stan_code_noRE
      } else if (re == F & stan_data[[i]][[j]]$K == 0) {
        model_code <- stan_code_noREorCovs
      }
      fit_this_imp <- stan(model_name = "eviction",
                           model_code = model_code,
                           data = stan_data[[i]][[j]],
                           chains = 1,
                           iter = n.warmup + bs.samps / n.imps + 1,
                           warmup = n.warmup,
                           save_warmup = F,
                           sample_file = paste0("Intermediate/stan_parameters/stan_re",re,"_model",i,"_imp_",j,".csv"))
      timeSpent <- difftime(Sys.time(),t0)
      posterior <- rstan::extract(fit_this_imp)
      X <- newData[[i]][[j]]$X
      city <- newData[[i]][[j]]$city
      person <- newData[[i]][[j]]$person
      
      if (stan_data[[i]][[j]]$K > 0) {
        ## Calculate the covariate-based portion of the prediction
        xb <- X %*% t(posterior$beta)
      } else {
        xb <- 0
      }
      
      if (re == T) {
        ## Calculate the intercept portion of the prediction
        intercept <- t(replicate(nrow(X),posterior$alpha)) +
          t(posterior$u_city[,city]) +
          t(posterior$u_person[,person])
      } else {
        intercept <- t(replicate(nrow(X),posterior$alpha))
      }
      
      ## Combine to make the probability of eviction
      p <- plogis(xb + intercept)
      
      ## Calculate the cumulative probability for each child
      ## This is a bit slow.
      cum.prob <- data.frame(person = person,
                             p) %>%
        group_by(person) %>%
        summarize_all(.funs = function(x) 1 - prod(1 - x)) %>%
        group_by()
      return(cum.prob)
    }
    result <- data.frame(model = rep(i,nrow(pred_this_model[[1]])),
                         re = rep(re, nrow(pred_this_model[[1]]))) %>%
      bind_cols(pred_this_model[[1]]) %>%
      rename_at(vars(-contains("person")), .funs = function(x) gsub("X","imp1_draw",x))
    if (n.imps > 1) {
      for (j in 2:n.imps) {
        result <- result %>%
          left_join(pred_this_model[[j]] %>%
                      rename_at(vars(-contains("person")), .funs = function(x) gsub("X",paste0("imp",j,"_draw"),x)),
                    by = "person")
      }
    }
    return(result)
  }
  return(pred_all_models)
}
save(stanPred,
     file = "Intermediate/stanPred.RData")
stopCluster(cl)

############################################
## Cross-validate on the first imputation ##
############################################

## Assign folds in blocks of nfold
nfold <- 5
set.seed(08544)
folds <- as.vector(
  replicate(ceiling(stan_data[[1]][[1]]$N / nfold), sample(1:nfold))
)[1:stan_data[[1]][[1]]$N]

cvData <- lapply(stan_data, function(model_data) {
  first_imp <- model_data[[1]]
  ## Create a data frame of weights corresponding
  ## to the rows in cvData
  weights <- data.frame(person = first_imp$person) %>%
    left_join(IDs, by = "person") %>%
    select(idnum) %>%
    left_join(d %>%
                select(idnum, m1natwt),
              by = "idnum")
  ## Create a sort order for the rows such that
  ## a stratified sample by groups of nfold
  ## rules out bad draws for CV
  sorted <- order(first_imp$Y,
                  first_imp$city,
                  -weights$m1natwt,
                  first_imp$person)
  ## Sort each data element in that order
  cvData <- list(
    city = first_imp$city[sorted],
    person = first_imp$person[sorted],
    Y = first_imp$Y[sorted],
    X = first_imp$X[sorted,],
    K = ncol(first_imp$X),
    weights = weights[sorted,]
  )
})

# Define the cases we will loop through
cases <- data.frame(
  # Define the run number
  run = 1:(nfold*2*length(cvData)),
  # Define whether uses random effects
  re = rep(c(F,T), each = nfold*length(cvData)),
  # Define the model i that we fit
  i = rep(rep(1:length(cvData), each = nfold), 2),
  # Define the fold f that we omit
  f = rep(1:nfold, 2*length(cvData))
)

## Prepare for parallel computing
cl <- makeCluster(10, outfile = "Output/outfile.txt")
# In PowerShell, can tail that outfile with Get-Content Output/outfile.txt -Wait
registerDoParallel(cl)
set.seed(08544)
t0 <- Sys.time()
cvPred <- foreach(
  run = 1:nrow(cases),
  .combine = "rbind",
  .packages = c("rstan")
) %dorng% {
  print(paste("STARTING RUN",run,"OF",nrow(cases)))
  re <- cases$re[run]
  i <- cases$i[run]
  f <- cases$f[run]
  if (re == T & cvData[[i]]$K > 0) {
    model_code <- stan_code
  } else if (re == T & cvData[[i]]$K == 0) {
    model_code <- stan_code_noCovs
  } else if (re == F & cvData[[i]]$K > 0) {
    model_code <- stan_code_noRE
  } else if (re == F & cvData[[i]]$K == 0) {
    model_code <- stan_code_noREorCovs
  }
  fit <- stan(model_name = "eviction",
              model_code = model_code,
              data = list(
                N_city = 20,
                N_person = 4898,
                N = sum(folds != f),
                city = cvData[[i]]$city[folds != f],
                person = cvData[[i]]$person[folds != f],
                Y = cvData[[i]]$Y[folds != f],
                X = cvData[[i]]$X[folds != f,],
                K = cvData[[i]]$K
              ),
              chains = 1,
              iter = n.warmup + bs.samps,
              warmup = n.warmup,
              save_warmup = F)
  posterior <- rstan::extract(fit)
  X <- cvData[[i]]$X[folds == f,]
  city <- cvData[[i]]$city[folds == f]
  person <- cvData[[i]]$person[folds == f]
  
  if (cvData[[i]]$K > 0) {
    ## Calculate the covariate-based portion of the prediction
    xb <- X %*% t(posterior$beta)
  } else {
    xb <- 0
  }
  
  if (re == T) {
    ## Calculate the intercept portion of the prediction
    intercept <- t(replicate(nrow(X),posterior$alpha)) +
      t(posterior$u_city[,city]) +
      t(posterior$u_person[,person])
  } else {
    intercept <- t(replicate(nrow(X),posterior$alpha))
  }
  
  ## Combine to make the probability of eviction
  phat <- plogis(xb + intercept)
  
  return(data.frame(
    re = re,
    model = i,
    fold = f,
    person = cvData[[i]]$person[folds == f],
    weight = cvData[[i]]$weights[folds == f,"m1natwt"],
    y = cvData[[i]]$Y[folds == f],
    phat
  ))
}
difftime(t0,Sys.time())
stopCluster(cl)
save(cvPred, file = "Intermediate/cvPred.Rdata")
  
